counts = read.table("filter_star_counts.txt", header = TRUE,row.names = 1)
new_colnames <- c(
"N01", "N02", "N03", "N04",
"A2_R01", "A2_R02", "A2_R03", "A2_R04",
"A24_R01", "A24_R02", "A24_R03", "A24_R04")
colnames(counts) = new_colnames
# colnames(counts)
conditions <- factor(c(rep("Naive", 4), rep("Allo2h", 4), rep("Allo24h", 4)))
coldata <- data.frame(row.names=colnames(counts), condition=conditions)
# coldata
# check, suppose to get TRUE,indicates alignment between design and real data
all(rownames(coldata) %in% colnames(counts))
## [1] TRUE
rownames(coldata) == colnames(counts)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# design DESeq matrix
dds = DESeqDataSetFromMatrix(countData = counts,
colData = coldata,
design = ~ condition)
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
length_table = read.table("S2_STAR2_practice_featurecounts.txt", row.names = 1, header = TRUE, sep = "\t")
# head(length_table)
mcols(dds)$basepairs = length_table[rownames(dds),"Length"]
normalized_counts <- counts(dds, normalized = TRUE)
cpm_values <- cpm(normalized_counts)
gene_lengths_kb <- mcols(dds)$basepairs / 1000
# Function to calculate RPKM from CPM and gene lengths
calculate_rpkm_from_cpm <- function(cpm_values, gene_lengths_kb) {
rpkm <- sweep(cpm_values, 1, gene_lengths_kb, "/")
return(rpkm)
}
rpkm <- calculate_rpkm_from_cpm(cpm_values, gene_lengths_kb)
log2_rpkm = log2(rpkm+1)
pca <- prcomp(t(log2_rpkm))
# PCA from rpkm normalized data
pca_data <- data.frame(PC1 = pca$x[,1], PC2 = pca$x[,2], condition = conditions)
ggplot(pca_data, aes(PC1, PC2, color = condition)) +
geom_point(size = 3) +
ggtitle("PCA of Alveolar Macrophage Samples") +
xlab(paste0("PC1: ", round(summary(pca)$importance[2,1] * 100, 1), "% variance")) +
ylab(paste0("PC2: ", round(summary(pca)$importance[2,2] * 100, 1), "% variance"))
vsd = vst(dds,blind = FALSE)
## another PCA plot
pcaData <- plotPCA(vsd, intgroup="condition", returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
# Compute Pearson correlation
cor_matrix <- cor(rpkm, method = "pearson")
# Create correlation pearson heatmap
pheatmap(cor_matrix,
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
display_numbers = TRUE,
fontsize = 7,
main = "Pearson's Correlation Heatmap")
# compute sample to sample distance
sampleDists = dist(t(assay(vsd)))
# print(sampleDists)
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
# sample to sampe distance plot
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
## Filtering out Noise
rpkm_df = as.data.frame(rpkm)
cutoffs = seq(0, 10, by=0.25)
num_genes_above_cutoff = sapply(cutoffs, function(cutoff) {
colSums(rpkm_df > cutoff)
})
# Convert the results to a data frame and transpose it
num_genes_above_cutoff_df = as.data.frame(t(num_genes_above_cutoff))
# Set the column names of the transposed data frame
colnames(num_genes_above_cutoff_df) = colnames(rpkm_df)
# Add the cutoff values as a new column
num_genes_above_cutoff_df$Cutoff = cutoffs
# Reshape the data frame to long format for plotting
df_num_melted = melt(num_genes_above_cutoff_df, id.vars = "Cutoff", variable.name = "Sample", value.name = "Number_of_Genes")
# Plot the number of genes above different cutoffs
ggplot(df_num_melted, aes(x = Cutoff, y = Number_of_Genes, color = Sample)) +
geom_line() +
geom_point() +
theme_minimal() +
xlab("RPKM Cutoff") +
ylab("Number of Genes") +
ggtitle("Number of Genes Above Different RPKM Cutoffs")
raw_counts = as.data.frame(normalized_counts)
plot(log2(rpkm_df$N01 + 1), log2(rpkm_df$N03+1),title("N1 vs N3"))
# set filter threshold as rpkm = 1
keep = (rowSums(rpkm) >= 6)
dds_filtered <- dds[keep,]
dds_filtered
## class: DESeqDataSet
## dim: 12908 12
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(12908): TrnP TrnT ... Lypla1 Mrpl15
## rowData names(27): baseMean baseVar ... maxCooks basepairs
## colnames(12): N01 N02 ... A24_R03 A24_R04
## colData names(2): condition sizeFactor
##MA plot
res_1 = results(dds_filtered, contrast =c ("condition","Allo2h","Naive"))
# use plotMA function
DESeq2::plotMA(res_1, ylim = c(-5, 5))
# use ggplot for beautiful plot
res_df <- as.data.frame(res_1)
# Add a column for color based on log2FoldChange and threshold of padj
res_df$color <- ifelse(res_df$padj < 0.1, ifelse(res_df$log2FoldChange < 0, "red", "blue"), "grey")
res_df$logBaseMean <- log2(res_df$baseMean + 1)
# Add a column for log-transformed baseMean
ggplot(res_df, aes(x =logBaseMean, y = log2FoldChange, color = color)) +
geom_point(alpha = 0.4) +
ylim(c(-5, 5)) +
scale_color_identity() +
xlab("Average log2 baseMean") +
ylab("Log2 Fold Change") +
ggtitle("MA Plot with Negative Log2 Fold Change in Red") +
theme(legend.position = "none")
## Warning: Removed 538 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Volcanol plot
res1 = results(dds,contrast = c("condition","Naive","Allo2h"))
p <- EnhancedVolcano(res_1,
lab = rownames(res_1),
title = "Naive vs.Allo2h",
x = 'log2FoldChange',
y = 'pvalue',
pointSize = 2.0,
labSize = 4.0)
print(p)
# dds_filtered
num_genes = length(dds_filtered)
num_genes
## [1] 12908
anova_table = as.data.frame(counts(dds_filtered))
anova_table$Gene = rownames(anova_table)
# head(anova_table)
Naive = as.data.frame(anova_table[,c("Gene","N01","N02","N03","N04")])
A2 = as.data.frame(anova_table[,c("Gene","A2_R01","A2_R02","A2_R03","A2_R04")])
A24 = as.data.frame(anova_table[,c("Gene","A24_R01","A24_R02","A24_R03","A24_R04")])
Naive_long = melt(Naive, varnames = c("Gene", "Sample"), value.name = "Expression")
## Using Gene as id variables
A2_long = melt(A2, varnames = c("Gene", "Sample"), value.name = "Expression")
## Using Gene as id variables
A24_long = melt(A24, varnames = c("Gene", "Sample"), value.name = "Expression")
## Using Gene as id variables
Naive_long$Group = "Naive"
A24_long$Group = "A24"
A2_long$Group = "A2"
all_long = bind_rows(Naive_long, A2_long,A24_long)
# Function to perform ANOVA for a single gene
perform_anova = function(df) {
aov_result = aov(Expression ~ Group, data = df)
p_value = summary(aov_result)[[1]]["Group", "Pr(>F)"]
return(p_value)
}
# Initialize a data frame to store results
anova_results = data.frame(Gene = character(), p_value = numeric(), stringsAsFactors = FALSE)
# Loop through each gene and perform ANOVA
for (gene in unique(all_long$Gene)) {
gene_data = all_long %>% filter(Gene == gene)
p_value <- perform_anova(gene_data)
anova_results <- rbind(anova_results, data.frame(Gene = gene, p_value = p_value))
}
# head(anova_results)
pvalue_all= anova_results$p_value
hist(pvalue_all, breaks = 50, ylim = c(0, 6000))
anova_results <- anova_results %>%
mutate(adj_p_value = p.adjust(p_value,method = "BH"))
# Filter significant results
significant_genes <- anova_results %>%
filter(adj_p_value <= 0.05)
length(rownames(significant_genes))
## [1] 5770
# Function to calculate z-scores
cal_zscore = function(df) {
df$Expression = as.numeric(df$Expression)
df$z_score = scale(df$Expression)
return(df) # Ensure the modified dataframe is returned
}
# Initialize a data frame to store results
results_all <- data.frame(Gene = character(), variable = numeric(), z_score = numeric(), stringsAsFactors = FALSE)
# Loop through each gene and calculate z-scores
for (gene in unique(significant_genes$Gene)) {
gene_data <- all_long %>% filter(Gene == gene)
gene_data <- cal_zscore(gene_data)
# Store results
results_all <- rbind(results_all, gene_data)
}
# results_all
# Spread the data to wide format
heatmap_data <- results_all %>%
select(Gene, variable, z_score) %>%
spread(key = variable, value = z_score)
# Set rownames to be the genes
rownames(heatmap_data) <- heatmap_data$Gene
heatmap_data <- heatmap_data[, -1]
# Perform k-means clustering
set.seed(123) # For reproducibility
kmeans_result = kmeans(heatmap_data, centers = 6,
nstart = 2500,
iter.max = 2000)
kmeans_data = heatmap_data[order(kmeans_result$cluster),]
# Create a heatmap with the clustered data
pheatmap(kmeans_data,
cluster_cols = FALSE,
cluster_rows = TRUE,
show_rownames = FALSE)
# annotation_row = data.frame(Cluster = as.factor(kmeans_result$cluster)))
gene_list = dds_filtered[significant_genes$Gene,]
gene_list_df = results(gene_list)
GO_list = gene_list_df$log2FoldChange
names(GO_list) = rownames(gene_list_df)
GO_list = sort(GO_list, decreasing = TRUE)
anyDuplicated(names(GO_list))
## [1] 0
# GO_list = gene_list[!duplicated(names(GO_list))]
# significant_genes
ego <- enrichGO(gene = names(GO_list),
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
print(ego@result$ID[1:10],)
## [1] "GO:0022613" "GO:0007059" "GO:0008380" "GO:0006260" "GO:0034470"
## [6] "GO:0042254" "GO:0044772" "GO:0033044" "GO:0000819" "GO:0098813"
dotplot(ego, showCategory = 10, title = "GO")
## GSEA
gsea_results <- gseGO(geneList = GO_list,
OrgDb = org.Mm.eg.db,
ont = "ALL",
keyType = "SYMBOL",
exponent = 1,
pvalueCutoff = 0.05,
verbose = TRUE,
seed = TRUE)
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 58 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
dotplot(gsea_results, showCategory=5, split=".sign") + facet_grid(.~.sign)
cnetplot(gsea_results, categorySize="pvalue", foldChange=GO_list, showCategory = 3)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
ridgeplot(gsea_results) + labs(x = "enrichment distribution")
## Picking joint bandwidth of 0.177
gse <- pairwise_termsim(gsea_results)
emapplot(gse, showCategory = 10)
gseaplot(gsea_results, by = "all", title = gse$Description[1], geneSetID = 1)
# Extract gene lists for each cluster
gene_clusters <- split(rownames(kmeans_data), kmeans_result$cluster)
enrichment_results <- list()
# Perform functional enrichment analysis on each gene list
for (i in 1:6) {
cluster_genes <- gene_clusters[[i]]
cluster_genes_entrez <- bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)
cluster_genes_entrez <- cluster_genes_entrez$ENTREZID
ego <- enrichGO(gene = cluster_genes_entrez,
OrgDb = org.Mm.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2)
enrichment_results[[i]] <- ego
p <- dotplot(ego, showCategory = 10)
print(p)
}
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 2.03% of input gene IDs are fail to map...
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 2.57% of input gene IDs are fail to map...
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 2.07% of input gene IDs are fail to map...
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 0.99% of input gene IDs are fail to map...
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 2.92% of input gene IDs are fail to map...
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(cluster_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 2.87% of input gene IDs are fail to map...
enrichment_results[[1]]
## #
## # over-representation test
## #
## #...@organism Mus musculus
## #...@ontology BP
## #...@keytype ENTREZID
## #...@gene chr [1:820] "73558" "320469" "11303" "18671" "66190" "23797" "11658" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...428 enriched terms found
## 'data.frame': 428 obs. of 9 variables:
## $ ID : chr "GO:0140053" "GO:0032543" "GO:0022613" "GO:0007059" ...
## $ Description: chr "mitochondrial gene expression" "mitochondrial translation" "ribonucleoprotein complex biogenesis" "chromosome segregation" ...
## $ GeneRatio : chr "33/750" "28/750" "47/750" "45/750" ...
## $ BgRatio : chr "167/28905" "130/28905" "442/28905" "419/28905" ...
## $ pvalue : num 7.60e-20 4.36e-18 3.16e-16 9.49e-16 4.57e-15 ...
## $ p.adjust : num 3.46e-16 9.93e-15 4.80e-13 1.08e-12 4.16e-12 ...
## $ qvalue : num 2.83e-16 8.12e-15 3.93e-13 8.84e-13 3.40e-12 ...
## $ geneID : chr "211064/66971/229487/228019/68537/68463/68611/94065/67270/69163/18120/66493/66242/66258/74238/71701/105787/76781"| __truncated__ "211064/66971/229487/228019/68537/68463/68611/94065/67270/69163/18120/66493/66242/66258/66084/69527/94254/75619/"| __truncated__ "217869/14645/66282/66596/226432/67493/69163/18150/68219/101739/67148/69126/236732/57808/56505/75062/110809/1035"| __truncated__ "12877/83431/232933/12448/107995/52563/67849/72155/76959/15183/73804/208628/17120/17350/17865/54392/76044/215387"| __truncated__ ...
## $ Count : int 33 28 47 45 29 31 37 44 40 32 ...
## #...Citation
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu.
## clusterProfiler 4.0: A universal enrichment tool for interpreting omics data.
## The Innovation. 2021, 2(3):100141
top_terms <- sapply(enrichment_results, function(x) {
if (nrow(x) > 0) {
return(x@result$Description[1])
} else {
return("No significant term")
}
})
# Create a data frame for annotation
annotation_df <- data.frame(Cluster = factor(1:6), GO_Process = top_terms)
pheatmap(kmeans_data,
cluster_cols = FALSE,
cluster_rows = FALSE,
show_rownames = FALSE,
annotation_row = data.frame(Cluster = as.factor(kmeans_result$cluster)))
annotation_df
## Cluster GO_Process
## 1 1 mitochondrial gene expression
## 2 2 ribonucleoprotein complex biogenesis
## 3 3 mitotic cell cycle phase transition
## 4 4 DNA replication
## 5 5 ribonucleoprotein complex biogenesis
## 6 6 chromosome segregation
# Plot CDK2 RPKM histogram
cdk2_rpkm = rpkm["Cdk2",]
cdk2_df = data.frame(Group = conditions, RPKM = cdk2_rpkm)
summary_cdk2_df = cdk2_df %>%
group_by(Group) %>%
summarise(
mean_RPKM = mean(RPKM),
sd_RPKM = sd(RPKM)
)
ggplot(summary_cdk2_df, aes(x = Group, y = mean_RPKM, fill = Group)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
geom_errorbar(aes(ymin = mean_RPKM - sd_RPKM, ymax = mean_RPKM + sd_RPKM),
width = 0.2, position = position_dodge(0.7)) +
theme_minimal() +
labs(title = "RPKM Values by Group with SD Error Bars", x = "Group", y = "Mean RPKM") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))